### examine coverage of tea party candidates on cable channels ###

### libraries
library(tidyverse)
library(lubridate)
library(data.table)
library(stringr)
library(ggthemes)
library(quanteda)
library(glue)

# make theme_few() the active ggplot2 theme
theme_set(theme_few())

### set paths
# everything is resolved relative to the parent of the working directory
base_dir <- ".."
data_dir <- glue("{base_dir}/data")
## note: transcript files must be licensed, not included in archive
transcript_path <- glue("{data_dir}/transcripts")

### parsing functions
check_contributor <- function(speaker, channel) {
  # Does the speaker label mention the network the transcript comes from?
  #
  # Args:
  #   speaker: character vector of speaker labels.
  #   channel: lower-case channel key(s): cnn, fox, msnbc, pbs, nbc, cbs, abc.
  # Returns:
  #   Logical vector: TRUE where `speaker` contains the network's on-air name
  #   as a literal (non-regex) substring.
  network_labels <- c(
    cnn = "CNN",
    fox = "FOX NEWS",
    msnbc = "MSNBC",
    pbs = "PBS",
    nbc = "NBC NEWS",
    cbs = "CBS NEWS",
    abc = "ABC NEWS")

  # translate the channel key into the label used inside speaker strings
  wanted_label <- network_labels[channel]

  str_detect(speaker, fixed(wanted_label))
}

drop_headers <- function(
    text,
    headers = c("DOCUMENT-PUBLICATION",
                "LOAD-DATE",
                "DATE",
                "TIME",
                "INTERVIEWERS",
                "LANGUAGE",
                "TYPE",
                "GUESTS",
                "LENGTH",
                "BYLINE",
                "SECTION",
                "DOCUMENT-TYPE",
                "PUBLICATION-TYPE",
                "TRANSCRIPT",
                "SHOW",
                "HIGHLIGHT",
                "MUSIC",
                "NONE")) {
  # Remove metadata header fields from transcript text.
  #
  # Each header is deleted as "<NAME>:" plus the rest of that line and any
  # whitespace that follows it.
  #
  # Args:
  #   text:    character vector of transcript text.
  #   headers: header field names; they are interpolated into the regular
  #            expression unescaped.
  # Returns:
  #   `text` with every listed header field removed.

  # The patterns are not anchored, so a short name also matches the tail of a
  # longer one ("TYPE:" inside "DOCUMENT-TYPE:"). Removing the longest names
  # first keeps a short name from eating half of a longer header and leaving
  # "DOCUMENT-" / "PUBLICATION-" behind in the text.
  longest_first <- headers[order(nchar(headers), decreasing = TRUE)]

  header_res <- as.character(glue("{longest_first}:.*\\s*"))
  reduce(header_res, str_remove_all, .init = text)
}

drop_prefixes <- function(
  text,
  prefixes = c("MR", "MRS", "MS", "REP", "SEN")) {
  # Strip a leading title ("MR. ", "SEN ", ...) from each element of `text`.
  #
  # Args:
  #   text:     character vector (speaker names in this script).
  #   prefixes: titles to strip; each may be followed by an optional period
  #             and must be followed by a single space.
  # Returns:
  #   `text` with the titles removed. Titles are tried one after another in
  #   the order given, each against the result of the previous removal.
  stripped <- text
  for (prefix_re in as.character(glue("^{prefixes}[.]? "))) {
    stripped <- str_remove(stripped, prefix_re)
  }
  stripped
}

drop_speakers <- function(text) {
  # Remove speaker labels from transcript text.
  #
  # A label is any run of characters that are neither lower-case letters nor
  # periods, ending in a colon.
  speaker_label_re <- "[^a-z.]+:"
  str_remove_all(text, pattern = speaker_label_re)
}

find_long_version <- function(sp, speakers) {
  # Find the full version of a speaker name, rather than a shortened version.
  #
  # Args:
  #   sp:       a single speaker name. It is used as a regular expression, so
  #             callers may also pass a hand-built pattern.
  #   speakers: character vector of candidate names (in this script: all
  #             speakers of one transcript).
  # Returns:
  #   The longest element of `speakers` that matches `sp`, or `sp` itself when
  #   nothing matches or `sp` is missing/empty.

  # An empty pattern matches every candidate (the name would silently turn
  # into the longest speaker of the transcript) and an NA pattern yields a
  # zero-length result; neither is a usable name, so pass them through.
  if (is.na(sp) || !nzchar(sp)) return(sp)

  matches <- grep(sp, speakers, value = TRUE)
  if (length(matches) == 0) return(sp)
  get_longest(matches)
}

find_long_version_firstinitial <- function(speakers) {
  # Replace first-initial names ("E. MURTAUGH" or "E MURTAUGH") with the
  # longest full-name form present in `speakers` ("EMILY MURTAUGH").
  #
  # Args:
  #   speakers: character vector of speaker names (in this script: all
  #             speakers of one transcript).
  # Returns:
  #   A vector of the same length; abbreviated entries with a full-name
  #   counterpart are replaced by it, all other entries are unchanged.
  matched_pos <- grep("^[A-Z]\\.? [A-Z'-]", speakers)

  if (length(matched_pos) == 0) return(speakers)

  expand_initial <- function(short_name) {
    # "E. MURTAUGH" -> "^E[A-Z]+ MURTAUGH"; the rest of the name is used
    # verbatim as part of the regular expression.
    long_re <- sub("^([A-Z])\\.? ", "^\\1[A-Z]+ ", short_name)
    candidates <- grep(long_re, speakers, value = TRUE)

    # No full name in this set: keep the abbreviated name. (Falling back to
    # the search pattern would store the regex itself as the speaker.)
    if (length(candidates) == 0) return(short_name)

    candidates[which.max(nchar(candidates))]
  }

  speakers[matched_pos] <- vapply(
    speakers[matched_pos], expand_initial, character(1), USE.NAMES = FALSE
  )

  speakers
}

get_longest <- function(x) {
  # Return the element of `x` with the most characters (the first one on
  # ties); a zero-length input gives a zero-length result.
  n_chars <- nchar(x)
  x[which.max(n_chars)]
}

parse_speaker_texts <- function(text, sppattern = "^[^a-z]+:") {
  # Split one transcript into (speaker, text) segments.
  #
  # Args:
  #   text:      transcript text; only the first element is parsed.
  #   sppattern: regular expression for a speaker label, applied in multiline
  #              mode so "^" matches at the start of every line.
  # Returns:
  #   list(speaker = <character>, text = <character>) of equal length, one
  #   entry per speaker label found. Both are zero-length when the transcript
  #   contains no speaker label.
  # Raises:
  #   An error if the number of labels and the number of text segments cannot
  #   be reconciled.
  speaker_re <- regex(sppattern, multiline = TRUE)

  texts <- str_split(text, pattern = speaker_re)[[1]] %>%
    str_trim()

  speakers <- str_extract_all(text, pattern = speaker_re)[[1]] %>%
    # leading whitespace, an optional "(...)" note and leading punctuation
    str_remove("^\\s*(\\([^)]+\\))?\\W*") %>%
    # a label match can span several lines; drop everything before the last
    # line break sequence
    str_remove_all(".*(\\r|\\n)\\n ?") %>%
    # the colon and anything after it
    str_remove(":.*$") %>%
    str_trim()

  if (length(speakers) == length(texts)) {
    list(speaker = speakers, text = texts)
  } else if (length(speakers) == length(texts) - 1) {
    # Splitting yields one more piece than there are labels; the first piece
    # is whatever precedes the first label. Use texts[-1] rather than
    # texts[2:length(texts)]: with no labels at all, 2:1 counts backwards and
    # would return c(NA, <whole text>) instead of nothing.
    list(speaker = speakers, text = texts[-1])
  } else {
    stop(paste0("Speakers and speech length mismatch: ", length(speakers), " vs ", length(texts)))
  }
}

### MAIN LOOP FUNCTIONS
read_transcript_docs <- function(filename, strip_speakers = FALSE) {
  # Shared reader behind get_raw_text() and get_text_by_speaker(): read one
  # raw transcript file and return one row per document.
  #
  # Args:
  #   filename:       path of the raw transcript file. Show, channel, year and
  #                   (optionally) a "_<first>-<last>" document range are
  #                   parsed out of the path.
  #   strip_speakers: if TRUE, speaker labels are removed from the text with
  #                   drop_speakers() after the headers have been dropped.
  # Returns:
  #   data.table with columns show, channel, year, date, transcript_number
  #   and text.

  # progress output
  cat(filename, "\n")

  # Each document starts with "<n> of <N> DOCUMENTS <show name> <month>
  # <day>, <year> <word>"; the same pattern both splits the file and supplies
  # the per-document metadata.
  doc_header_re <- "(\\d+) *of *\\d+ *DOCUMENTS\\s+([A-Za-z ]+[A-Za-z])\\s+(\\w+) (\\d+), (\\d+) \\w+"

  # Files named "..._<first>-<last>.txt" get their document numbers shifted by
  # <first> - 1 (presumably numbering restarts at 1 inside each file - TODO
  # confirm against the raw files).
  if (grepl("\\d+_(\\d+)-\\d+", filename)) {
    start_num <- as.integer(str_extract(filename, "(?<=_)\\d+(?=-)")) - 1
  } else {
    start_num <- 0
  }

  raw_text <- read_file(filename)

  docs <- str_split(raw_text, doc_header_re)[[1]] %>%
    str_trim() %>%
    # drop (near-)empty pieces, e.g. whatever precedes the first header
    discard(~ nchar(.) <= 1) %>%
    # join lines broken by a single CRLF; a CRLF directly followed by another
    # carriage return is left in place
    str_replace_all(pattern = "\\r\\n(?!\\r)", replacement = " ") %>%
    drop_headers()

  if (strip_speakers) {
    docs <- drop_speakers(docs)
  }

  # match columns: 1 = full match, 2 = document number, 3 = show name,
  # 4 = month, 5 = day, 6 = year
  hdrs <- str_match_all(raw_text, doc_header_re)[[1]]

  data.table(
    show = str_extract(filename, "(?<=(cnn|fox|msnbc|abc|cbs|nbc|pbs)_)[0-9a-z]+"),
    channel = str_extract(filename, "(?<=/)(cnn|fox|msnbc|abc|cbs|nbc|pbs)"),
    year = as.numeric(str_extract(filename, "(?<=_)\\d+(?=[_.])")),
    date = mdy(paste(hdrs[, 4], hdrs[, 5], hdrs[, 6])),
    transcript_number = start_num + as.numeric(hdrs[, 2]),
    text = docs
  )
}

get_raw_text <- function(filename) {
  # Read a raw transcript file and clean it up: headers and speaker labels
  # are removed, one row per document.
  #
  # Returns a data.table with columns show, channel, year, date,
  # transcript_number and text.
  read_transcript_docs(filename, strip_speakers = TRUE)
}

get_text_by_speaker <- function(filename) {
  # Read a raw transcript file, clean it up, and split every document into
  # segments assigned to speakers.
  #
  # Returns a data.table with columns channel, show, year, date,
  # transcript_number, speaker and text (one row per speaker segment).
  shows <- read_transcript_docs(filename, strip_speakers = FALSE)

  shows[,
    parse_speaker_texts(text),
    by = .(channel, show, year, date, transcript_number)]
}



### BEGIN SCRIPT ###

# raw files all have extension ".txt"; keep the 2009-2011 files, including
# those whose name carries a "_<first>-<last>" document range
raw_files <- list.files(
  transcript_path,
  pattern = "20(09|10|11)(_\\d+-\\d+)?\\.txt$",
  full.names = TRUE,
  recursive = TRUE
)


## step 1: parse transcripts (with minimal processing)
raw_transcripts <- rbindlist(map(raw_files, get_raw_text))

saveRDS(raw_transcripts, file = glue("{data_dir}/raw_transcripts.rds"))


## step 2: parse transcripts, chunking by speaker
## morning joe is from Newsbank, doesn't follow conventions in labeling
## speakers, so its files are left out here
transcripts_by_speaker <- rbindlist(
  map(
    discard(raw_files, ~ str_detect(.x, pattern = "morningjoe")),
    get_text_by_speaker
  )
)

## clean up, and deal with abbreviations for speaker names
## steps are 1) delete metadata entries
##           2) delete unidentified speakers
##           3) look for patterns like "ANDERSON COOPER, HOST" followed by "COOPER" in same transcript, replace latter with former.
##           4) substitute for patterns of the form "EMILY MURTAUGH" followed by "E. MURTAUGH" later in same doc

transcripts_by_speaker <- transcripts_by_speaker %>%
    # 1) metadata field names that were picked up as speaker labels
    .[!(speaker %in% c("LOAD-DATE", "LANGUAGE", "DATE", "TIME", "INTERVIEWERS","TYPE", "GUESTS", "LENGTH", "BYLINE", "SECTION", "DOCUMENT-TYPE", "PUBLICATION-TYPE", "TRANSCRIPT", "SHOW", "HIGHLIGHT", "MUSIC", "NONE"))] %>%
    # strip "... VIDEO CLIP ...)" / "... VIDEO CLIP ...," lead-ins and a leading "(...)" note
    .[, speaker := sub(".*VIDEO CLIP[^)]*(\\)|,) ?", "", speaker)] %>%
    .[, speaker := sub("^\\([^)]*\\) ?", "", speaker)] %>%
    # remaining parentheses, trailing punctuation, titles such as "MR." / "SEN."
    .[, speaker := str_trim(gsub("[()]", " ", speaker))] %>%
    .[, speaker := sub("\\W+$", "", speaker)] %>%
    .[, speaker := drop_prefixes(speaker)] %>%
    # 2) unidentified speakers. This filter has to come after the last clean-up
    #    step: the steps above can themselves leave a name empty, and an empty
    #    name is not a speaker - used as a search pattern below it matches
    #    every name in the transcript.
    .[speaker != ""] %>%
    # 3) short name -> longest matching name within the same transcript
    .[, speaker := map_chr(speaker, ~ find_long_version(., speaker)),
        by = .(channel, show, year, date, transcript_number)] %>%
    # 4) first-initial name -> full name within the same transcript
    .[, speaker := find_long_version_firstinitial(speaker),
        by = .(channel, show, year, date, transcript_number)]


# save
# (path is built like the raw_transcripts.rds path above; pasting data_dir and
# the file name together without a separator wrote to "<data_dir>transcripts_...")
saveRDS(transcripts_by_speaker, file = glue("{data_dir}/transcripts_by_speaker.rds"))

# collapse to speaker-month, save this also
transcripts_by_speaker[, month := floor_date(date, unit = "month")]
transcripts_sp_mo <- transcripts_by_speaker[,
    .(text = paste(text, collapse = ". ")),
    by = .(channel, month, speaker)]
transcripts_sp_mo[, word_count := ntoken(text, what = "word", remove_punct = TRUE)]

# keep only speaker-months with at least 10 words
transcripts_sp_mo <- transcripts_sp_mo[word_count >= 10]

saveRDS(transcripts_sp_mo, file = glue("{data_dir}/transcripts_sp_mo.rds"))
